from pathlib import Path
from typing import Optional

import math
import numpy as np
import torch
import torch.nn.functional as F
from diffusers.models.attention_processor import Attention, AttnProcessor2_0
from diffusers.utils import deprecate

from layers.cache_base import CacheBase


class CustomAttnProcessor2_0(AttnProcessor2_0, CacheBase):
    r"""
    Attention processor built on ``F.scaled_dot_product_attention`` (PyTorch 2.0+).

    The forward pass projects the inputs to query/key/value, runs scaled
    dot-product attention and applies the output projection. The only extra
    behaviour is when ``use_cache`` is enabled and ``mode`` is ``"generate"``:
    the queries and keys of the first half of the batch are overwritten with
    those of the second half before attention is computed.
    """

    def __init__(
        self,
        cur_t=None,
        mode="invert",
        load_attn: bool = False,
        use_cache: bool = False,
        *args,
        **kwargs,
    ):
        """
        Store the processor configuration.

        Args:
            cur_t: Value stored as ``self.t``; can be replaced later through
                `set_cur_t`.
            mode: ``"invert"`` or ``"generate"``; only consulted in `__call__`
                when ``use_cache`` is true.
            load_attn: Flag stored on the instance (not read by the code in
                this class).
            use_cache: When false, `__call__` always runs plain attention and
                ``mode`` is ignored.
            *args, **kwargs: Accepted and ignored.

        Raises:
            ImportError: If ``torch.nn.functional`` lacks
                ``scaled_dot_product_attention``.
        """
        sdpa_available = hasattr(F, "scaled_dot_product_attention")
        if not sdpa_available:
            raise ImportError("AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
        super().__init__()

        self.t = cur_t
        self.mode = mode
        self.load_attn = load_attn
        self.use_cache = use_cache

        # Attention-cache state. Within this class only `attn_cache` is read
        # (by `load_cache`); `fusion_mode` and `mean_factor` are just stored.
        self.attn_cache = None
        self.fusion_mode = "min"  # "mean", "max" or "min"
        self.mean_factor = 1 / 10

    def set_cur_t(self, cur_t):
        """Replace the stored ``t`` value."""
        self.t = cur_t

    def set_invert_or_generate(self, mode):
        """Switch the processor mode (``"invert"`` or ``"generate"``)."""
        self.mode = mode

    def set_load_mode(self, mode):
        """Update the ``load_attn`` flag."""
        self.load_attn = mode

    def reset_cache(self):
        """Drop the current attention cache and leave ``attn_cache`` as ``None``."""
        del self.attn_cache
        self.attn_cache = None

    def save_cache_to_file(self):
        """Currently a no-op."""

    def load_cache_from_file(self):
        """Currently a no-op."""

    @torch.no_grad()
    def save_cache(self, q, k):
        """Currently a no-op; ``q`` and ``k`` are not used."""

    @torch.no_grad()
    def load_cache(self, B, device="cuda"):
        """
        Return the cached attention duplicated along dim 0, moved to ``device``.

        Args:
            B: Not used by the current implementation.
            device: Target device for the returned tensor.

        Returns:
            ``None`` when nothing is cached, otherwise the cache concatenated
            with itself and transferred with ``non_blocking=True``.
        """
        cached = self.attn_cache
        if cached is None:
            return None
        doubled = torch.cat([cached, cached])
        return doubled.to(device, non_blocking=True)

    def __call__(
        self,
        attn: Attention,
        hidden_states: torch.Tensor,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        temb: Optional[torch.Tensor] = None,
        *args,
        **kwargs,
    ) -> torch.Tensor:
        """
        Run attention for ``attn`` on ``hidden_states``.

        Args:
            attn: Attention module supplying the projections, norms and
                output layers used here.
            hidden_states: 3-D input, or 4-D input that is flattened to a
                sequence over its last two dims and restored on return.
            encoder_hidden_states: Source for keys/values; when ``None`` the
                hidden states themselves are used.
            attention_mask: Optional mask, prepared through
                ``attn.prepare_attention_mask`` and reshaped per head.
            temb: Passed to ``attn.spatial_norm`` when that norm is present.
            *args, **kwargs: Only inspected for the deprecated ``scale``
                argument.

        Returns:
            The attention output, with the residual added when
            ``attn.residual_connection`` is set, divided by
            ``attn.rescale_output_factor``.

        Raises:
            ValueError: If ``use_cache`` is true and ``mode`` is neither
                ``"invert"`` nor ``"generate"``.
        """
        # Any extra positional argument or an explicit `scale` keyword is deprecated.
        if args or kwargs.get("scale") is not None:
            deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`."
            deprecate("scale", "1.0.0", deprecation_message)

        residual = hidden_states
        if attn.spatial_norm is not None:
            hidden_states = attn.spatial_norm(hidden_states, temb)

        input_ndim = hidden_states.ndim
        is_spatial = input_ndim == 4

        if is_spatial:
            # Flatten the trailing two dims into a sequence axis: (B, C, H, W) -> (B, H*W, C).
            batch_size, channel, height, width = hidden_states.shape
            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)

        # Batch size and sequence length come from whichever tensor feeds keys/values.
        if encoder_hidden_states is None:
            shape_source = hidden_states
        else:
            shape_source = encoder_hidden_states
        batch_size, sequence_length, _ = shape_source.shape

        if attention_mask is not None:
            attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
            # scaled_dot_product_attention expects attention_mask shape to be
            # (batch, heads, source_length, target_length)
            attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])

        if attn.group_norm is not None:
            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)

        query = attn.to_q(hidden_states)

        # Pick the key/value source, normalising the encoder states if configured.
        if encoder_hidden_states is None:
            encoder_hidden_states = hidden_states
        elif attn.norm_cross:
            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)

        key = attn.to_k(encoder_hidden_states)
        value = attn.to_v(encoder_hidden_states)

        head_dim = key.shape[-1] // attn.heads

        # Split the channel dim into heads: (B, S, heads * head_dim) -> (B, heads, S, head_dim).
        query, key, value = (
            proj.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
            for proj in (query, key, value)
        )

        if attn.norm_q is not None:
            query = attn.norm_q(query)
        if attn.norm_k is not None:
            key = attn.norm_k(key)

        # With caching disabled, or in "invert" mode, attention runs unmodified.
        # In "generate" mode the first half of the batch reuses the queries and
        # keys of the second half (values are left untouched).
        if self.use_cache and self.mode != "invert":
            if self.mode != "generate":
                raise ValueError(f"Invalid mode: {self.mode}. Choose from 'invert' or 'generate'.")
            # NOTE(review): the in-place copy needs both halves to have the same
            # length, i.e. it appears to assume an even batch size - confirm.
            half = int(batch_size // 2)
            query[:half] = query[half:]
            key[:half] = key[half:]

        hidden_states = F.scaled_dot_product_attention(
            query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
        )

        # Merge heads back: (B, heads, S, head_dim) -> (B, S, heads * head_dim).
        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
        hidden_states = hidden_states.to(query.dtype)

        # to_out[0] is the linear projection, to_out[1] the dropout.
        hidden_states = attn.to_out[0](hidden_states)
        hidden_states = attn.to_out[1](hidden_states)

        if is_spatial:
            # Restore the original 4-D layout.
            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)

        if attn.residual_connection:
            hidden_states = hidden_states + residual

        return hidden_states / attn.rescale_output_factor
